Normalize reads and create DGE objects
library(tidyverse)
Registered S3 method overwritten by 'dplyr':
method from
print.rowwise_df
Registered S3 methods overwritten by 'dbplyr':
method from
print.tbl_lazy
print.tbl_sql
[30m── [1mAttaching packages[22m ────────────────────────────────────────────────────────── tidyverse 1.3.0 ──[39m
[30m[32m✓[30m [34mggplot2[30m 3.2.1 [32m✓[30m [34mpurrr [30m 0.3.3
[32m✓[30m [34mtibble [30m 2.1.3 [32m✓[30m [34mdplyr [30m 0.8.3
[32m✓[30m [34mtidyr [30m 1.0.0 [32m✓[30m [34mstringr[30m 1.4.0
[32m✓[30m [34mreadr [30m 1.3.1 [32m✓[30m [34mforcats[30m 0.4.0[39m
[30m── [1mConflicts[22m ───────────────────────────────────────────────────────────── tidyverse_conflicts() ──
[31mx[30m [34mdplyr[30m::[32mfilter()[30m masks [34mstats[30m::filter()
[31mx[30m [34mdplyr[30m::[32mlag()[30m masks [34mstats[30m::lag()[39m
library(edgeR)
Loading required package: limma
files <- dir("../input/Kallisto_output/", include.dirs = TRUE)
files %>% head()
[1] "1a1_q_002_S1_R1_001" "1a2_q_003_S2_R1_001" "1a3_q_004_S3_R1_001"
[4] "1a4_q_005_S4_R1_001" "1a5_q_006_S5_R1_001" "1a6_q_007_S6_R1_001"
counts.list <- map(files, ~ read_tsv(
file=file.path("..","input","Kallisto_output",.,"abundance.tsv"),
col_types = "cdddd"))
names(counts.list) <- files
counts <- sapply(counts.list, select, est_counts) %>%
bind_cols(counts.list[[1]][,"target_id"],.)
counts[is.na(counts)] <- 0
colnames(counts) <- sub(".est_counts","",colnames(counts),fixed = TRUE)
counts
write_csv(counts,"../output/2018-timecourse_V3.0_raw_counts_.csv.gz")
key <- readxl::read_excel("../input/tube_no_legend_time_course_2018.xlsx",
na=c("","na"),
col_types=c("text", "text", "text", "skip", "text", "skip", "skip", "skip", "text", "text", "text", "skip", "text", "skip", "skip", "date", "date")) %>%
mutate(sampling_time_specific=format(sampling_time_specific, format="%H:%M:%S"))
key
create reformatted tube_no
key <- key %>%
mutate(tube_no_2 = {
tolower(tube_no) %>%
str_replace("q_([1-9](_|$))", "q_00\\1") %>%
str_replace("q_([1-9][0-9](_|$))", "q_0\\1") }) %>%
select(tube_no, tube_no_2, everything())
key
samples <- tibble(
file=files,
tube_no_2 = str_extract(files, pattern = "q_[0-9]{3}(_d8)?")
)
samples
samples <- left_join(samples, key)
Joining, by = "tube_no_2"
samples <- samples %>%
mutate(sampling_day=str_pad(sampling_day,width=2,side = "left",pad="0")) %>%
mutate(group=str_c(genotype, soil_trt, sampling_day, sampling_time,sep="-"))
samples
counts2 <- counts %>%
as.data.frame() %>%
column_to_rownames(var = "target_id") %>%
as.matrix() %>%
round(0)
samples2 <- samples %>%
select(-tube_no_2, -tube_no, -pot, -plant_no, -sampling_day_specific, -sampling_time_specific) %>%
as.data.frame() %>%
column_to_rownames(var="file")
dge <- DGEList(counts=counts2, samples=samples2, group=samples2$group)
dge <- calcNormFactors(dge)
save(dge, file="../output/timecourseDGE.Rdata")
load("../output/timecourseDGE.Rdata")
cpm averaged for each sample type
log2cpmGroup <- cpmByGroup(dge, log = TRUE)
dim(log2cpmGroup)
[1] 46250 48
head(log2cpm[,1:10])
FPsc-ATM_BLANK-02-2_afternoon FPsc-ATM_BLANK-03-1_morn
BraA01g000010.3C 0.1228981 0.1855259
BraA01g000020.3C 4.5040649 4.1498411
BraA01g000030.3C -1.3215286 -0.6842086
BraA01g000040.3C 6.0727831 5.9570869
BraA01g000050.3C 2.0488744 1.5780617
BraA01g000060.3C -1.3241428 -1.4374041
FPsc-ATM_BLANK-03-2_afternoon FPsc-ATM_BLANK-03-3_evening_5.30
BraA01g000010.3C -0.2656371 -0.3678073
BraA01g000020.3C 4.2173450 4.0387983
BraA01g000030.3C -0.4147637 -0.3646628
BraA01g000040.3C 6.0349576 5.9312680
BraA01g000050.3C 1.5568859 1.9381266
BraA01g000060.3C -1.4374041 -1.4374041
FPsc-ATM_BLANK-03-4_night_1 FPsc-ATM_BLANK-03-5_night_2 FPsc-ATM_BLANK-04-1_morn
BraA01g000010.3C -0.4365975 -0.6444735 -0.8937118
BraA01g000020.3C 4.0717934 4.0001420 4.2911430
BraA01g000030.3C -1.1149994 -0.3966245 -0.5600769
BraA01g000040.3C 5.7669761 5.8282151 5.9890498
BraA01g000050.3C 1.7998456 1.8459871 1.6437956
BraA01g000060.3C -1.4374041 -1.4374041 -1.4374041
FPsc-ATM_BLANK-04-2_afternoon FPsc-ATM_BLANK-04-3_evening_5.30
BraA01g000010.3C 0.3708927 -0.5266988
BraA01g000020.3C 3.9806886 4.2305824
BraA01g000030.3C -0.8354973 -0.8281699
BraA01g000040.3C 5.9081725 5.7867749
BraA01g000050.3C 1.5984162 1.8688470
BraA01g000060.3C -1.4374041 -1.4374041
FPsc-ATM_BLANK-04-4_night_1
BraA01g000010.3C -0.03054253
BraA01g000020.3C 3.91593441
BraA01g000030.3C -0.75851324
BraA01g000040.3C 5.91802906
BraA01g000050.3C 1.90403532
BraA01g000060.3C -1.43740410
write.csv(log2cpmGroup, "../output/log2cpmGroup.csv.gz")
cpm for each individual sample
samplenames <- str_c(dge$samples$group,"_blk",dge$samples$block)
log2cpmSample <- cpmByGroup(dge, log = TRUE, group=samplenames)
dim(log2cpmSample)
[1] 46250 288
head(log2cpmSample[,1:10])
FPsc-ATM_BLANK-02-2_afternoon_blk1 FPsc-ATM_BLANK-02-2_afternoon_blk2
BraA01g000010.3C 0.3400216 0.384717
BraA01g000020.3C 4.5764433 4.411531
BraA01g000030.3C -1.4374041 -1.437404
BraA01g000040.3C 6.0943903 6.243121
BraA01g000050.3C 1.9168970 1.595641
BraA01g000060.3C -1.4374041 -1.437404
FPsc-ATM_BLANK-02-2_afternoon_blk3 FPsc-ATM_BLANK-02-2_afternoon_blk4
BraA01g000010.3C -1.4374041 0.09091967
BraA01g000020.3C 4.7686054 4.37711591
BraA01g000030.3C -1.4374041 -0.73415423
BraA01g000040.3C 5.9496220 6.04456997
BraA01g000050.3C 2.6011030 1.75891783
BraA01g000060.3C -0.8695907 -1.43740410
FPsc-ATM_BLANK-02-2_afternoon_blk5 FPsc-ATM_BLANK-02-2_afternoon_blk6
BraA01g000010.3C -0.03366926 0.5637431
BraA01g000020.3C 4.64860469 4.1661298
BraA01g000030.3C -1.43740410 -1.4374041
BraA01g000040.3C 6.03998090 6.0467807
BraA01g000050.3C 2.49466558 1.6145669
BraA01g000060.3C -1.43740410 -1.4374041
FPsc-ATM_BLANK-03-1_morn_blk1 FPsc-ATM_BLANK-03-1_morn_blk2
BraA01g000010.3C 0.5251546 -0.5241447
BraA01g000020.3C 3.9196502 3.8473212
BraA01g000030.3C -1.4374041 -0.2202086
BraA01g000040.3C 5.8183833 6.0247640
BraA01g000050.3C 1.8394763 1.9236065
BraA01g000060.3C -1.4374041 -1.4374041
FPsc-ATM_BLANK-03-1_morn_blk3 FPsc-ATM_BLANK-03-1_morn_blk4
BraA01g000010.3C 0.3096726 1.115971
BraA01g000020.3C 4.3493457 3.695601
BraA01g000030.3C -0.8802583 -1.437404
BraA01g000040.3C 5.8443365 5.830659
BraA01g000050.3C 0.9527679 1.834303
BraA01g000060.3C -1.4374041 -1.437404
write.csv(log2cpmSample, "../output/log2cpmSample.csv.gz")